knitr::opts_chunk$set(message = FALSE, warning = FALSE)

View this report on the web: https://ccbr.github.io/spacesavers2/2024/report_2024-01-17.html

library(bslib)
library(dplyr)
library(DT)
library(fontawesome)
library(ggplot2)
library(glue)
library(here)
library(htmltools)
library(knitr)
library(lubridate)
library(plotly)
library(purrr)
library(readr)
library(rlang)
library(scales)
library(shiny)
library(stringr)
library(tidyr)
theme_set(theme_bw())

# Convert sizes expressed in a binary unit to bytes.
#
# x:         numeric vector of sizes.
# from_unit: unit of `x`; one of "B", "KiB", "MiB", "GiB", "TiB", "PiB".
#            Either a single unit applied to every element of `x`, or one
#            unit per element.
# Returns a numeric vector of sizes in bytes. A missing (NA) unit gives NA;
# any other unrecognised unit is an error.
to_bytes <- function(x, from_unit) {
  # power of 1024 that corresponds to each supported unit
  bytes_units <- c(B = 0, KiB = 1, MiB = 2, GiB = 3, TiB = 4, PiB = 5)
  exponent <- unname(bytes_units[as.character(from_unit)])
  unknown <- is.na(exponent) & !is.na(from_unit)
  if (any(unknown)) {
    stop(
      "Unknown unit: ",
      paste(unique(from_unit[unknown]), collapse = ", "),
      "; expected one of ",
      paste(names(bytes_units), collapse = ", "),
      call. = FALSE
    )
  }
  x * 1024^exponent
}
# Convert sizes in bytes to another binary unit.
#
# x:       numeric vector of sizes in bytes.
# to_unit: target unit, as understood by `to_bytes()` (e.g. "GiB", "TiB").
# Returns a numeric vector of sizes expressed in `to_unit`.
from_bytes <- function(x, to_unit) {
  # Divide by the number of bytes in one `to_unit`. The divisor must not
  # depend on `x`: otherwise x == 0 evaluates 0 / 0 and yields NaN.
  x / to_bytes(1, to_unit)
}

# Element-wise wrappers: Vectorize() calls the wrapped function once per
# element, so both the value and the unit may vary by row when used on
# data-frame columns (e.g. `to_bytes_v(savings_value, savings_unit)`).
from_bytes_v <- Vectorize(from_bytes)
to_bytes_v <- Vectorize(to_bytes)

# Drop rows that do not belong to a real person.
#
# dat:     data frame with one row per record.
# usercol: unquoted name of the column holding usernames (default `username`).
# Returns `dat` without the aggregate/service accounts listed in `non_people`
# and without usernames that consist only of digits. Rows whose username is
# NA are dropped as well (filter() discards NA conditions).
filter_users <- function(dat, usercol = username) {
  non_people <- c("allusers", "rpcuser", "slurm")
  dat %>%
    filter(
      !({{ usercol }} %in% non_people), # not actual people
      # entirely numeric usernames; the pattern is anchored so that real
      # usernames that merely contain a digit (e.g. "smith2") are kept
      !str_detect({{ usercol }}, "^[0-9]+$")
    )
}

# Report whether the values in `x` are spread over a wide range.
#
# x:                  numeric vector; NA values are ignored.
# n_orders_magnitude: the range counts as large when max(x) - min(x) is at
#                     least 10^n_orders_magnitude (an absolute difference,
#                     not a ratio).
# Returns a single TRUE or FALSE, so the result is safe to use in `if`.
# FALSE is returned when `x` has no non-missing values.
is_large_range <- function(x, n_orders_magnitude = 5) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(FALSE)
  }
  xrange <- range(x)
  isTRUE((xrange[2] - xrange[1]) >= 10^n_orders_magnitude)
}

# Horizontal bar chart of a single metric, one bar per user.
#
# dat:      data frame with a `username` column and a column named `x_metric`.
# x_metric: name (string) of the column drawn on the x axis; it also drives
#           the bar fill and the hover text.
# Returns a ggplot object. The `text` aesthetic is not drawn by ggplot2; it is
# there to be picked up as the tooltip when the plot is passed to ggplotly().
plot_user_metric <- function(dat, x_metric) {
  bar_mapping <- aes(
    x = eval_tidy(data_sym(x_metric)),
    y = username,
    fill = eval_tidy(data_sym(x_metric)),
    text = glue("{username}\n{eval_tidy(data_sym(x_metric))} {x_metric}")
  )
  ggplot(dat, bar_mapping) +
    geom_col() +
    # TODO: ggplotly doesn't know what to do with scale::label_log
    # {if (is_large_range(dat %>% pull(x_metric))) scale_x_log10(labels = label_log(digits = 2)) } +
    theme(legend.position = "none") +
    labs(x = x_metric, y = "")
}

# Line-and-point chart of a single metric over time, one colour per user.
#
# dat:      data frame with `date` and `username` columns and a column named
#           `y_metric`.
# y_metric: name (string) of the column drawn on the y axis.
# Returns a ggplot object. Only the points carry the `text` aesthetic, which
# serves as the tooltip once the plot is converted with ggplotly().
plot_metric_time <- function(dat, y_metric) {
  series_mapping <- aes(
    x = date,
    y = eval_tidy(data_sym(y_metric)),
    color = username
  )
  hover_mapping <- aes(
    text = glue("{username}\n{eval_tidy(data_sym(y_metric))} {y_metric}")
  )
  ggplot(dat, series_mapping) +
    geom_line(alpha = 0.7) +
    geom_point(hover_mapping) +
    labs(y = y_metric)
}

# Build one tab for a folder, containing one interactive plot per metric.
#
# dat:         summary table with `FolderPath` and `username` columns and one
#              column per entry of `summary_metrics`.
# folder_path: value of `FolderPath` to display; also used as the tab title.
# plot_fcn:    function(dat, metric_name) returning a ggplot, e.g.
#              `plot_metric_time` or `plot_user_metric`.
#
# Reads the globals `summary_metrics` (metric column names) and `n_top_users`
# (how many users to keep per metric).
# Returns a `nav_panel` wrapping a `navset_pill_list` with one panel per metric.
panel_summary <- function(dat,
                          folder_path = "/data/CCBR",
                          plot_fcn = plot_metric_time) {
  summary_dat_folder <- dat %>%
    filter(FolderPath == folder_path)

  # Long format, one row per (record, metric). `value_adj` is the ranking key:
  # OverallScore is negated, so for that metric the lowest raw values rank
  # highest.
  rank_long <- function(df) {
    df %>%
      pivot_longer(all_of(summary_metrics), names_to = "metric") %>%
      mutate(value_adj = case_when(
        metric == "OverallScore" ~ -value,
        TRUE ~ value
      ))
  }

  # Union, across all metrics, of the `n_top_users` highest-ranked users.
  top_users <- summary_dat_folder %>%
    rank_long() %>%
    group_by(metric) %>%
    slice_max(order_by = value_adj, n = n_top_users) %>%
    ungroup() %>%
    pull(username) %>%
    unique()

  plots <- lapply(summary_metrics, function(y_metric) {
    # Users in ascending order of this metric; used as factor levels below so
    # the plot orders users by value.
    user_order <- summary_dat_folder %>%
      filter(username %in% top_users) %>%
      rank_long() %>%
      filter(metric == y_metric) %>%
      arrange(value_adj) %>%
      pull(username) %>%
      unique()

    # Convert the metric to a display unit and put the unit into the column
    # name, which doubles as the axis label and panel title.
    plot_dat <- summary_dat_folder
    if (y_metric %in% c("TotalBytes", "DuplicateBytes")) {
      to_unit <- "TiB" # TODO: dynamically set based on range of metric
      new_metric_name <- glue("{y_metric}_{to_unit}")
      plot_dat <- plot_dat %>%
        mutate("{new_metric_name}" := from_bytes(.data[[y_metric]], to_unit))
      y_metric <- new_metric_name
    } else if (y_metric %in% c("TotalMeanAge", "DuplicateMeanAge")) {
      new_metric_name <- glue("{y_metric}_Days")
      plot_dat <- plot_dat %>%
        rename("{new_metric_name}" := all_of(y_metric))
      y_metric <- new_metric_name
    } else if (y_metric %in% c("TotalFiles", "DuplicateFiles")) {
      new_metric_name <- glue("{y_metric}_Millions")
      plot_dat <- plot_dat %>%
        mutate("{new_metric_name}" := .data[[y_metric]] / 10^6)
      y_metric <- new_metric_name
    }

    p <- plot_dat %>%
      filter(username %in% user_order) %>%
      mutate(username = factor(username, levels = user_order)) %>%
      # round for readable tooltips
      mutate(across(where(is.numeric), function(x) round(x, digits = 2))) %>%
      plot_fcn(y_metric)
    nav_panel(
      title = y_metric,
      card_header(y_metric),
      ggplotly(p, tooltip = "text")
    )
  })

  nav_panel(
    title = markdown(glue("`{folder_path}`")),
    navset_pill_list(!!!plots)
  )
}
# Report parameters.
n_top_users <- params$n_top_users
input_dir <- params$input_dir # here("data")
# File types that are not split per user.
aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
# TODO: only load last N weeks of data to keep RAM usage reasonably low
# One row per entry found in the directories below `input_dir`; entries that
# sit directly inside `input_dir` itself are not listed.
all_files <- tibble(
  filename = list.dirs(input_dir) %>%
    keep(function(x) x != input_dir) %>%
    map(function(x) list.files(x, full.names = TRUE)) %>%
    unlist()
)
# Per-user files: everything whose name does not mention an aggregated file
# type. The full path is split on "." into five pieces; the first piece still
# contains the directory, so the date is taken from its basename. Names with
# too few pieces are kept with NA fields (too_few = "debug").
user_dat <- all_files %>%
  filter(
    str_detect(
      filename,
      str_c(aggregated_filetypes, collapse = "|"),
      negate = TRUE
    )
  ) %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("date", "path", "username", "file", "ext"),
    too_few = "debug",
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(basename(date)))

# All dates that could be parsed, and the latest of them.
dates <- unique(user_dat$date[!is.na(user_dat$date)])
most_recent_date <- max(dates)

# Total size of /data/CCBR in TiB, read from the most recent all-users
# summary file.
total_usage_tb <- user_dat %>%
  filter(
    username == "allusers" &
      date == most_recent_date &
      file == "summary" &
      path == "_data_CCBR"
  ) %>%
  pull(filename) %>%
  read_tsv() %>%
  filter(FolderPath == "/data/CCBR") %>%
  pull(TotalBytes) %>%
  from_bytes("TiB")
# TODO disk_usage_tb doesn't agree with output from `df`

# Lines of the most recent all-users grubbers log (.err file) for /data/CCBR.
grubbers_allusers_err <- user_dat %>%
  filter(
    username == "allusers" &
      date == most_recent_date &
      file == "grubbers" &
      ext == "err" &
      path == "_data_CCBR"
  ) %>%
  pull(filename) %>%
  read_lines()
# The message is the third ":"-separated field of the second log line.
grubbers_message <- str_split(grubbers_allusers_err[2], ":")[[1]][3]

# From here on only real users are kept.
user_dat <- filter_users(user_dat)
usernames <- unique(user_dat$username)

# Summary tables of the most recent date, one file per user, stacked into a
# single table; the originating file name is kept and split into its parts.
summary_dat_recent <- user_dat %>%
  filter(date == most_recent_date & file == "summary") %>%
  pull(filename) %>%
  map(function(x) {
    mutate(read_tsv(x), filename = x)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  )
# Names of the numeric columns: these are the metrics that get plotted.
summary_metrics <- unique(
  pull(
    pivot_longer(summary_dat_recent, where(is.numeric), names_to = "metric"),
    metric
  )
)

Total disk usage

# Disk usage table read from results/disk_usage.txt, with the date it
# was recorded.
disk_usage <- read_tsv(here("results", "disk_usage.txt"))
df_date <- as_date(pull(disk_usage, "date"))

# Two value boxes side by side: used/total space and number of users.
layout_column_wrap(
  width = 1 / 2,
  value_box(
    title = p(fa("hard-drive"), "  Disk space in /data/CCBR"),
    value = disk_usage %>%
      mutate(Usage = glue("{Used} / {Size}")) %>%
      select(Usage, `Use%`) %>%
      kable() %>%
      markdown(),
    theme = "warning"
  ),
  value_box(
    title = p(fa("users", prefer_type = "regular"), "  Users"),
    value = p(
      glue("{length(usernames)} users as of {format(df_date, '%b %d, %Y')}")
    ),
    theme = "primary"
  )
)

Disk space in /data/CCBR

Usage Use%
197T / 200T 99%

Users

36 users as of Oct 17, 2023

Summary over time

Usage by top users for each spacesavers metric.

# Summary tables of every user for every date, stacked into a single table.
# The date is the last path component of the first "."-separated piece of
# the file name.
summary_dat_all <- user_dat %>%
  filter(file == "summary") %>%
  pull(filename) %>%
  map(function(x) {
    mutate(read_tsv(x), filename = x)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(str_replace(basepath, ".*/", "")))

# One tab per folder, each showing every metric over time.
navset_tab(
  panel_summary(summary_dat_all, "/data/CCBR", plot_metric_time),
  panel_summary(summary_dat_all, "/data/CCBR/rawdata", plot_metric_time),
  panel_summary(summary_dat_all, "/data/CCBR/projects", plot_metric_time)
)
TotalBytes_TiB
DuplicateBytes_TiB
PercentDuplicateBytes
TotalFiles_Millions
DuplicateFiles_Millions
PercentDuplicateFiles
TotalMeanAge_Days
DuplicateMeanAge_Days
AgeScore
DupScore
OccScore
OverallScore
TotalBytes_TiB
DuplicateBytes_TiB
PercentDuplicateBytes
TotalFiles_Millions
DuplicateFiles_Millions
PercentDuplicateFiles
TotalMeanAge_Days
DuplicateMeanAge_Days
AgeScore
DupScore
OccScore
OverallScore
TotalBytes_TiB
DuplicateBytes_TiB
PercentDuplicateBytes
TotalFiles_Millions
DuplicateFiles_Millions
PercentDuplicateFiles
TotalMeanAge_Days
DuplicateMeanAge_Days
AgeScore
DupScore
OccScore
OverallScore

Most recent summary (2024-01-15)

Usage by top users for each spacesavers metric.

# One tab per folder, each showing every metric as a per-user bar chart for
# the most recent date.
navset_tab(
  panel_summary(summary_dat_recent, "/data/CCBR", plot_user_metric),
  panel_summary(summary_dat_recent, "/data/CCBR/rawdata", plot_user_metric),
  panel_summary(summary_dat_recent, "/data/CCBR/projects", plot_user_metric)
)
TotalBytes_TiB
DuplicateBytes_TiB
PercentDuplicateBytes
TotalFiles_Millions
DuplicateFiles_Millions
PercentDuplicateFiles
TotalMeanAge_Days
DuplicateMeanAge_Days
AgeScore
DupScore
OccScore
OverallScore
TotalBytes_TiB
DuplicateBytes_TiB
PercentDuplicateBytes
TotalFiles_Millions
DuplicateFiles_Millions
PercentDuplicateFiles
TotalMeanAge_Days
DuplicateMeanAge_Days
AgeScore
DupScore
OccScore
OverallScore
TotalBytes_TiB
DuplicateBytes_TiB
PercentDuplicateBytes
TotalFiles_Millions
DuplicateFiles_Millions
PercentDuplicateFiles
TotalMeanAge_Days
DuplicateMeanAge_Days
AgeScore
DupScore
OccScore
OverallScore

Summary table

# Most recent all-users summary for /data/CCBR, with the byte columns replaced
# by GiB columns rounded to two decimals.
allusers_summary <- all_files %>%
  filter(str_detect(filename, "_data_CCBR.allusers.summary.txt")) %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("date", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(basename(date))) %>%
  slice_max(order_by = date) %>%
  pull(filename) %>%
  map(read_tsv) %>%
  list_rbind() %>%
  mutate(
    TotalBytes_GiB = round(from_bytes_v(TotalBytes, "GiB"), 2),
    DuplicateBytes_GiB = round(from_bytes_v(DuplicateBytes, "GiB"), 2),
    .before = "DuplicateBytes"
  ) %>%
  select(!c(TotalBytes, DuplicateBytes))

card(
  card_header("Summary across all users"),
  datatable(allusers_summary, fillContainer = TRUE)
)
Summary across all users

Blame matrix

# Most recent blame matrix for /data/CCBR. These file names have four
# "."-separated pieces (no username).
blame_matrix <- all_files %>%
  filter(str_detect(filename, "blamematrix")) %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("date", "path", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(basename(date))) %>%
  filter(
    !is.na(date) &
      file == "blamematrix" &
      ext == "tsv" &
      path == "_data_CCBR"
  ) %>%
  slice_max(order_by = date) %>%
  pull(filename) %>%
  map(read_tsv) %>%
  list_rbind()

card(
  card_header("Disk usage by user in subdirectories"),
  datatable(blame_matrix, fillContainer = TRUE)
)
Disk usage by user in subdirectories

Duplicate files

Deleting top grubbers will save 6.61 TiB!

Potential savings per user

# Potential savings per user, parsed from the grubbers log (.err) files of the
# most recent date for /data/CCBR.
grub_err <- user_dat %>%
  filter_users() %>%
  filter(!is.na(date), file == "grubbers", ext == "err", path == "_data_CCBR") %>%
  # slice_max() keeps ties, i.e. every file that carries the latest date
  slice_max(order_by = date) %>%
  pull(filename) %>%
  # read each log as a header-less table (first column is named X1) and
  # remember which file each line came from
  map(function(x) {
    read_tsv(x, col_names = FALSE) %>%
      mutate(filename = x)
  }) %>%
  list_rbind() %>%
  # only the lines that contain "Deleting" carry the savings message
  filter(str_detect(X1, "Deleting")) %>%
  # recover date/path/username/file/ext from the file name
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext")
  ) %>%
  mutate(
    date = as_date(basename(date)),
    # greedy match: drop everything up to and including the last ":"
    grub_msg = str_replace_all(X1, regex("^.*:"), ""),
    # the number that follows "save "; as.numeric() gives NA if the message
    # does not have the expected shape
    savings_value = as.numeric(
      str_replace_all(
        grub_msg,
        regex(".*save ([\\d\\.]*) [\\w!]+"),
        "\\1"
      )
    ),
    # the word between that number and the closing "!"
    savings_unit = str_replace_all(
      grub_msg,
      regex(".*save [\\d\\.]* ([\\w]+)!"),
      "\\1"
    ),
    # common scale, so that savings reported in different units are comparable
    savings_bytes = to_bytes_v(savings_value, savings_unit)
  )

# Savings per user, largest first: ranked by bytes, shown as the value and
# unit parsed from the grubbers message.
user_grub_table <- select(
  arrange(grub_err, desc(savings_bytes)),
  username, savings_value, savings_unit
)

card(
  card_header("Savings per user"),
  datatable(user_grub_table, fillContainer = TRUE)
)
Savings per user

All high-value duplicates

# Duplicate-file records from the grubbers .tsv files of the most recent date
# for /data/CCBR, one row per duplicated file hash.
grub_dat <- user_dat %>%
  filter_users() %>%
  filter(!is.na(date), file == "grubbers", ext == "tsv", path == "_data_CCBR") %>%
  # slice_max() keeps ties, i.e. every file that carries the latest date
  slice_max(order_by = date) %>%
  pull(filename) %>%
  # the files have no header row; remember which file each row came from
  map(function(x) {
    read_tsv(x, col_names = FALSE) %>%
      mutate(filename = x)
  }) %>%
  list_rbind() %>%
  # name the five positional columns
  rename(
    file_hash = X1,
    file_count = X2,
    total_disk_usage = X3,
    single_disk_usage = X4,
    filepaths = X5
  ) %>%
  # recover date/path/username/file/ext from the file name
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext")
  ) %>%
  mutate(date = as_date(basename(date))) %>%
  # NOTE(review): looks redundant - the file list was already passed through
  # filter_users() above and username is re-derived from the same file names
  filter_users() %>%
  # split "<value> <unit>" strings into separate value and unit columns,
  # keeping the original column
  separate_wider_delim(total_disk_usage,
    delim = " ",
    names = c("total_disk_usage_value", "total_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  separate_wider_delim(single_disk_usage,
    delim = " ",
    names = c("single_disk_usage_value", "single_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  # the split yields character columns; make the values numeric
  mutate(across(all_of(c("total_disk_usage_value", "single_disk_usage_value")), as.numeric))

# Duplicated files ranked by the total space they occupy.
# NOTE(review): the ranking and the `disk_usage_gb` name use the numeric value
# only and ignore `total_disk_usage_unit` - presumably every record is
# reported in GiB; confirm against the grubbers output.
top_files <- grub_dat %>%
  arrange(desc(total_disk_usage_value)) %>%
  select(disk_usage_gb = total_disk_usage_value, username, filepaths)

card(
  card_header("Top files"),
  datatable(top_files, fillContainer = TRUE)
)
Top files

For instructions on how to replace duplicates with hard links, see the `usurp` command in the spacesavers2 documentation.